## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.33 R6_2.5.1 fastmap_1.1.1 xfun_0.41
## [5] cachem_1.0.8 knitr_1.45 htmltools_0.5.7 rmarkdown_2.25
## [9] cli_3.6.1 sass_0.4.7 jquerylib_0.1.4 compiler_4.3.2
## [13] rstudioapi_0.15.0 tools_4.3.2 evaluate_0.23 bslib_0.5.1
## [17] yaml_2.3.7 rlang_1.1.2 jsonlite_1.8.7
## Loading required package: ggplot2
## Loading required package: lattice
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# List files in root dir
list.files()
## [1] "CaseStudy2.html" "CaseStudy2.pptx" "CaseStudy2.Rmd"
## [4] "CaseStudy2DDS.Rproj" "data" "README.md"
# Fetch data set from local folder
df_cs2 = read.csv("data/CaseStudy2-data.csv", header = TRUE)
df_cs2_no_attrition = read.csv("data/CaseStudy2CompSet No Attrition.csv", header = TRUE)
df_cs2_no_salary = read.csv("data/CaseStudy2CompSet No Salary.csv", header = TRUE)
## 'data.frame': 870 obs. of 36 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : chr "No" "No" "No" "No" ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : chr "Sales" "Research & Development" "Research & Development" "Sales" ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : chr "Life Sciences" "Medical" "Life Sciences" "Marketing" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : chr "Sales Executive" "Research Director" "Manufacturing Director" "Sales Executive" ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : chr "Divorced" "Single" "Single" "Married" ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "No" "No" "No" "No" ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
## ID Age Attrition BusinessTravel
## Min. : 1.0 Min. :18.00 Length:870 Length:870
## 1st Qu.:218.2 1st Qu.:30.00 Class :character Class :character
## Median :435.5 Median :35.00 Mode :character Mode :character
## Mean :435.5 Mean :36.83
## 3rd Qu.:652.8 3rd Qu.:43.00
## Max. :870.0 Max. :60.00
## DailyRate Department DistanceFromHome Education
## Min. : 103.0 Length:870 Min. : 1.000 Min. :1.000
## 1st Qu.: 472.5 Class :character 1st Qu.: 2.000 1st Qu.:2.000
## Median : 817.5 Mode :character Median : 7.000 Median :3.000
## Mean : 815.2 Mean : 9.339 Mean :2.901
## 3rd Qu.:1165.8 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :1499.0 Max. :29.000 Max. :5.000
## EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction
## Length:870 Min. :1 Min. : 1.0 Min. :1.000
## Class :character 1st Qu.:1 1st Qu.: 477.2 1st Qu.:2.000
## Mode :character Median :1 Median :1039.0 Median :3.000
## Mean :1 Mean :1029.8 Mean :2.701
## 3rd Qu.:1 3rd Qu.:1561.5 3rd Qu.:4.000
## Max. :1 Max. :2064.0 Max. :4.000
## Gender HourlyRate JobInvolvement JobLevel
## Length:870 Min. : 30.00 Min. :1.000 Min. :1.000
## Class :character 1st Qu.: 48.00 1st Qu.:2.000 1st Qu.:1.000
## Mode :character Median : 66.00 Median :3.000 Median :2.000
## Mean : 65.61 Mean :2.723 Mean :2.039
## 3rd Qu.: 83.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :100.00 Max. :4.000 Max. :5.000
## JobRole JobSatisfaction MaritalStatus MonthlyIncome
## Length:870 Min. :1.000 Length:870 Min. : 1081
## Class :character 1st Qu.:2.000 Class :character 1st Qu.: 2840
## Mode :character Median :3.000 Mode :character Median : 4946
## Mean :2.709 Mean : 6390
## 3rd Qu.:4.000 3rd Qu.: 8182
## Max. :4.000 Max. :19999
## MonthlyRate NumCompaniesWorked Over18 OverTime
## Min. : 2094 Min. :0.000 Length:870 Length:870
## 1st Qu.: 8092 1st Qu.:1.000 Class :character Class :character
## Median :14074 Median :2.000 Mode :character Mode :character
## Mean :14326 Mean :2.728
## 3rd Qu.:20456 3rd Qu.:4.000
## Max. :26997 Max. :9.000
## PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours
## Min. :11.0 Min. :3.000 Min. :1.000 Min. :80
## 1st Qu.:12.0 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80
## Median :14.0 Median :3.000 Median :3.000 Median :80
## Mean :15.2 Mean :3.152 Mean :2.707 Mean :80
## 3rd Qu.:18.0 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80
## Max. :25.0 Max. :4.000 Max. :4.000 Max. :80
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## Min. :0.0000 Min. : 0.00 Min. :0.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000
## Median :1.0000 Median :10.00 Median :3.000 Median :3.000
## Mean :0.7839 Mean :11.05 Mean :2.832 Mean :2.782
## 3rd Qu.:1.0000 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.0000 Max. :40.00 Max. :6.000 Max. :4.000
## YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.: 2.000 1st Qu.: 0.000
## Median : 5.000 Median : 3.000 Median : 1.000
## Mean : 6.962 Mean : 4.205 Mean : 2.169
## 3rd Qu.:10.000 3rd Qu.: 7.000 3rd Qu.: 3.000
## Max. :40.000 Max. :18.000 Max. :15.000
## YearsWithCurrManager
## Min. : 0.00
## 1st Qu.: 2.00
## Median : 3.00
## Mean : 4.14
## 3rd Qu.: 7.00
## Max. :17.00
## [1] 870 36
## ID Age Attrition
## 0 0 0
## BusinessTravel DailyRate Department
## 0 0 0
## DistanceFromHome Education EducationField
## 0 0 0
## EmployeeCount EmployeeNumber EnvironmentSatisfaction
## 0 0 0
## Gender HourlyRate JobInvolvement
## 0 0 0
## JobLevel JobRole JobSatisfaction
## 0 0 0
## MaritalStatus MonthlyIncome MonthlyRate
## 0 0 0
## NumCompaniesWorked Over18 OverTime
## 0 0 0
## PercentSalaryHike PerformanceRating RelationshipSatisfaction
## 0 0 0
## StandardHours StockOptionLevel TotalWorkingYears
## 0 0 0
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## 0 0 0
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## 0 0 0
# Count the number of employees with attrition, we will compare with rest of sample later
sum(df_cs2$Attrition == "Yes")
## [1] 140
### Company Profile Snapshot: Age, Gender, Education, Pay, Role ###
# Box plot age by department
box_plot_age_dept <- df_emp %>% ggplot(aes(Department, Age, fill = Department)) + geom_boxplot() + ggtitle(paste("Box plot of Age by Department, n = ", count(df_emp)))
ggplotly(box_plot_age_dept)
# Box plot years in current role by department
box_plot_yirc_dept <- df_emp %>% ggplot(aes(Department, YearsInCurrentRole, fill = Department)) + geom_boxplot() + ggtitle(paste("Box plot of Years In Current Role by Department, n = ", count(df_emp)))
ggplotly(box_plot_yirc_dept)
# Box plot age by job role
box_plot_age_role <- merge(df_emp, df_job) %>% ggplot(aes(JobRole, Age, fill = JobRole)) + geom_boxplot() + ggtitle(paste("Box plot of Age by Job Role, n = ", count(df_emp))) + theme(axis.text.x = element_text(angle = 15, vjust = 0.5, hjust = 1))
ggplotly(box_plot_age_role)
# Box plot years in current role by job role
box_plot_role_yicr <- merge(df_emp, df_job) %>% ggplot(aes(JobRole, YearsInCurrentRole, fill = JobRole)) + geom_boxplot() + ggtitle(paste("Box plot of Years in current role by Job Role, n = ", count(df_emp))) + xlab("Job Role") + ylab("Years in current role") + theme(axis.text.x = element_text(angle = 15, vjust = 0.5, hjust = 0.5))
ggplotly(box_plot_role_yicr)
# Bar plot job role by gender
bar_plot_dodge_role_gender <- merge(df_job, df_emp) %>% ggplot(aes(JobRole, fill = Gender)) + geom_bar(position = "dodge") + ggtitle(paste("Bar plot of Job Roles by Gender, n = ", count(df))) + xlab("Job Role") + ylab("Count") + theme(axis.text.x = element_text(angle = 15, vjust = 0.5, hjust = 0.5))
ggplotly(bar_plot_dodge_role_gender)
# Box plot of monthly income by job role
box_plot_income_role <- df %>% ggplot(aes(JobRole, MonthlyIncome, fill = JobRole)) + geom_boxplot() + ggtitle(paste("Box plot of Monthly Income by Job Role, n = ", count(df))) + xlab("Job Role") + ylab("Monthly Income (USD)") + theme(axis.text.x = element_text(angle = 15, vjust = 0.5, hjust = 0.5))
ggplotly(box_plot_income_role)
# Bar plot of job satisfaction by job role
bar_plot_dodge_job_sat <- df %>% ggplot(aes(JobSatisfaction, fill = JobRole)) + geom_bar(position = "dodge") + ggtitle(paste("Bar plot of Job Satisfaction by Job Roles, n = ", count(df))) + xlab("Job Satisfaction") + ylab("Count") + theme(axis.text.x = element_text(angle = 15, vjust = 0.5, hjust = 1))
ggplotly(bar_plot_dodge_job_sat)
# Histogram of monthly income
hist_wrap_income_role <- df %>% ggplot(aes(MonthlyIncome, fill = JobRole)) + geom_histogram(binwidth = 200) + ggtitle(paste("Histogram of Monthly Income by Job Role, n = ", count(df))) + xlab("Monthly Income (USD)") + ylab("Count") + theme(axis.text.x = element_text(angle = 15, vjust = 0, hjust = 0.1)) + facet_wrap(~JobRole)
ggplotly(hist_wrap_income_role)
# Histogram of job satisfaction by job role
hist_wrap_income_sat <- df %>% ggplot(aes(MonthlyIncome, fill = JobRole, position = "stack")) + geom_histogram(binwidth = 200) + ggtitle(paste("Histogram of Monthly Income by Job Satisfaction, n = ", count(df))) + xlab("Monthly Income (USD)") + ylab("Count") + facet_wrap(~JobSatisfaction)
ggplotly(hist_wrap_income_sat)
### Attritional Factors by Variables ###
# Bar plot of attrition by job role
hist_att_role <- merge(df_emp, df_job) %>% ggplot(aes(JobRole, fill = Attrition)) + geom_bar(position = "stack") + ggtitle(paste("Bar plot of Attrition by Job Role, n = ", count(df_emp))) + xlab("Job Role") + ylab("Count") + theme(axis.text.x = element_text(angle = 15, vjust = 0, hjust = 1))
ggplotly(hist_att_role)
# Bar plot of attrition v. gender by job role
hist_wrap_role_gender <- merge(df_emp, df_job) %>% ggplot(aes(Attrition, fill = Gender)) + geom_bar(position = "dodge") + ggtitle(paste("Bar plot of Attrition by Gender, n = ", count(df_emp))) + xlab("Attrition") + ylab("Count") + facet_wrap(~JobRole)
ggplotly(hist_wrap_role_gender)
# PercentHike v. Years In Current Role
plot_wrap_psh_yslp <- df_cs2 %>% ggplot(aes(YearsSinceLastPromotion, PercentSalaryHike)) + geom_point(aes(color = Attrition), size = 0.2, position = "jitter") + facet_wrap(~JobRole)
ggplotly(plot_wrap_psh_yslp)
# Scatter plot years since last promotion v. years in current role
plot_wrap_yslp_yicr <- df_cs2 %>% ggplot(aes(YearsSinceLastPromotion, YearsInCurrentRole)) + geom_point(aes(color = Attrition), size = 0.2, position = "jitter") + facet_wrap(~JobRole)
ggplotly(plot_wrap_yslp_yicr)
# *** Scatter plot years with current manager v. years in current role ***
plot_wrap_ywcm_yicr <- df_cs2 %>% ggplot(aes(YearsWithCurrManager, YearsInCurrentRole)) + geom_point(aes(color = Attrition), size = 0.2, position = "jitter") + facet_wrap(~JobRole)
ggplotly(plot_wrap_ywcm_yicr)
# Scatter plot years with current manager v. job satisfaction
plot_wrap_ywcm_js <- df_cs2 %>% ggplot(aes(YearsWithCurrManager, JobSatisfaction)) + geom_point(aes(color = Attrition), size = 0.2, position = "jitter") + facet_wrap(~JobRole)
ggplotly(plot_wrap_ywcm_js)
#source("analysis/knn.R")
# Classification - attrition by leadership management
set.seed(6)
split_percent <- 0.7
l_train <- df_cs2 %>%
select(Attrition, YearsWithCurrManager, YearsInCurrentRole)
trainIndices = sample(1:dim(l_train)[1], round(split_percent * dim(l_train)[1]))
train = l_train[trainIndices,]
test = l_train[-trainIndices,]
scatter_smooth_ywcm_yicr <- l_train %>% ggplot(aes(YearsWithCurrManager, YearsInCurrentRole, color = Attrition)) + geom_point(position = "jitter") + geom_smooth(aes(color = Attrition)) + ggtitle(paste("Plot of Years with current manager v. Years in current role, n = ", count(l_train)))
ggplotly(scatter_smooth_ywcm_yicr)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
classification <- knn(train[,2:3], test[,2:3], train$Attrition, prob = TRUE, k = 5)
table(classification, test$Attrition)
##
## classification No Yes
## No 216 43
## Yes 2 0
confusionMatrix(table(classification, test$Attrition))
## Confusion Matrix and Statistics
##
##
## classification No Yes
## No 216 43
## Yes 2 0
##
## Accuracy : 0.8276
## 95% CI : (0.7762, 0.8714)
## No Information Rate : 0.8352
## P-Value [Acc > NIR] : 0.6673
##
## Kappa : -0.0149
##
## Mcnemar's Test P-Value : 2.479e-09
##
## Sensitivity : 0.9908
## Specificity : 0.0000
## Pos Pred Value : 0.8340
## Neg Pred Value : 0.0000
## Prevalence : 0.8352
## Detection Rate : 0.8276
## Detection Prevalence : 0.9923
## Balanced Accuracy : 0.4954
##
## 'Positive' Class : No
##
# Classification - attrition by time duration
set.seed(6)
split_percent <- 0.7
l_train <- df_cs2 %>%
select(Attrition, YearsAtCompany, YearsInCurrentRole)
trainIndices = sample(1:dim(l_train)[1], round(split_percent * dim(l_train)[1]))
train = l_train[trainIndices,]
test = l_train[-trainIndices,]
scatter_smooth_yac_yicr <- l_train %>% ggplot(aes(YearsAtCompany, YearsInCurrentRole, color = Attrition)) + geom_point(position = "jitter") + geom_smooth(aes(color = Attrition)) + ggtitle(paste("Plot of Years at company v. years in current role, n = ", count(l_train)))
ggplotly(scatter_smooth_yac_yicr)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
classification <- knn(train[,2:3], test[,2:3], train$Attrition, prob = TRUE, k = 5)
table(classification, test$Attrition)
##
## classification No Yes
## No 218 43
## Yes 0 0
confusionMatrix(table(classification, test$Attrition))
## Confusion Matrix and Statistics
##
##
## classification No Yes
## No 218 43
## Yes 0 0
##
## Accuracy : 0.8352
## 95% CI : (0.7846, 0.8781)
## No Information Rate : 0.8352
## P-Value [Acc > NIR] : 0.5406
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8352
## Neg Pred Value : NaN
## Prevalence : 0.8352
## Detection Rate : 0.8352
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No
##
# Classification - attrition by peer relationship
set.seed(6)
split_percent <- 0.7
l_train <- df_cs2 %>%
select(Attrition, RelationshipSatisfaction, YearsWithCurrManager)
trainIndices = sample(1:dim(l_train)[1], round(split_percent * dim(l_train)[1]))
train = l_train[trainIndices,]
test = l_train[-trainIndices,]
scatter_smooth_rel_yicr <- l_train %>% ggplot(aes(RelationshipSatisfaction, YearsWithCurrManager, color = Attrition)) + geom_point(position = "jitter") + geom_smooth(aes(color = Attrition)) + ggtitle(paste("Plot of Number of companies v. years at company, n = ", count(l_train)))
classification <- knn(train[,2:3], test[,2:3], train$Attrition, prob = TRUE, k = 5)
table(classification, test$Attrition)
##
## classification No Yes
## No 218 43
## Yes 0 0
confusionMatrix(table(classification, test$Attrition))
## Confusion Matrix and Statistics
##
##
## classification No Yes
## No 218 43
## Yes 0 0
##
## Accuracy : 0.8352
## 95% CI : (0.7846, 0.8781)
## No Information Rate : 0.8352
## P-Value [Acc > NIR] : 0.5406
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 1.504e-10
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8352
## Neg Pred Value : NaN
## Prevalence : 0.8352
## Detection Rate : 0.8352
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No
##
#source("analysis/nb.R")
# Continuous predictor of attrition by leadership
df_attr <- df_cs2 %>%
select(ID, Attrition, YearsWithCurrManager, YearsInCurrentRole) %>%
mutate(ID = as.factor(ID), YearsInCurrentRole = as.factor(YearsInCurrentRole), YearsWithCurrManager = as.factor(YearsWithCurrManager))
df_attr %>% ggplot(aes(YearsWithCurrManager, YearsInCurrentRole)) + geom_point(aes(color = Attrition), position = "jitter", size = 0.3) + geom_smooth(aes(color = Attrition), size = 0.3)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
iterations = 100
m_acc <- matrix(nrow = iterations)
split_percent <- 0.7
for(i in 1:iterations) {
train_indices = sample(1:dim(df_attr)[1], round(split_percent * dim(df_attr)[1]))
train = df_attr[train_indices,]
test = df_attr[-train_indices,]
model = naiveBayes(train[,3:4], train$Attrition)
table(predict(model, test[,3:4]), test$Attrition)
cm <- confusionMatrix(table(predict(model,test[,3:4]), test$Attrition))
m_acc[i] <- cm$overall[1]
}
mean_acc <- colMeans(m_acc)
mean_acc
## [1] 0.8044828
#source("analysis/lm.R")
# Linear Model Monthly Income v. Monthly Rate
fit <- lm(MonthlyIncome~HourlyRate, data = df_pay)
df_pay %>% ggplot(aes(MonthlyIncome, MonthlyRate)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
beta_0_hat <- fit$coefficients[1]
beta_1_hat <- fit$coefficients[2]
SE_beta_0_hat <- summary(fit)$coefficients[1,2]
SE_beta_1_hat <- summary(fit)$coefficients[2,2]
# Intercept
tstat_int <- beta_0_hat / SE_beta_0_hat
pvalue_int <- (1-pt(tstat_int, length(df_pay$MonthlyIncome)-2)) * 2
tstat_int
## (Intercept)
## 11.94203
pvalue_int
## (Intercept)
## 0
# Slope
tstat_slope <- beta_1_hat / SE_beta_1_hat
pvalue_slope <- (pt(tstat_slope, length(df_pay)-2)) * 2
tstat_slope
## HourlyRate
## 0.0704479
pvalue_slope
## HourlyRate
## 1.054192
summary(fit)
##
## Call:
## lm(formula = MonthlyIncome ~ HourlyRate, data = df_pay)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5322 -3539 -1444 1779 13609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6354.4250 532.1060 11.94 <2e-16 ***
## HourlyRate 0.5462 7.7535 0.07 0.944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4600 on 868 degrees of freedom
## Multiple R-squared: 5.718e-06, Adjusted R-squared: -0.001146
## F-statistic: 0.004963 on 1 and 868 DF, p-value: 0.9439
confint(fit)
## 2.5 % 97.5 %
## (Intercept) 5310.06024 7398.78985
## HourlyRate -14.67154 15.76397
# Welch Two Sample t-test
t.test(df_pay$MonthlyIncome, df_pay$MonthlyRate)
##
## Welch Two Sample t-test
##
## data: df_pay$MonthlyIncome and df_pay$MonthlyRate
## t = -27.648, df = 1487.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8498.351 -7372.362
## sample estimates:
## mean of x mean of y
## 6390.264 14325.621
# Conduct Hypothesis Test
# LOOCV
pred_error_sq <- c(0)
for(i in 1:dim(df_pay)[1]) {
loocv_i <- df_pay[-i,]
fit <- lm(MonthlyIncome ~ MonthlyRate, data = loocv_i)
pred_i <- predict(fit, data.frame(MonthlyRate = df_pay[i,4]))
pred_error_sq <- pred_error_sq + (df_pay[i,4] - pred_i)^2
}
SSE <- var(df_pay$MonthlyIncome)
R_squared <- 1 - (pred_error_sq/SSE)
MSE <- pred_error_sq / length(df_pay)
RMSE <- sqrt(pred_error_sq/length(df_pay))
RMSE
## 1
## 102794.5